#importing libraries
!pip uninstall networkx -y
!pip install networkx==2.4
# Pinning networkx to 2.4 (a downgrade) to avoid conflicts with the installed scipy version.
import numpy as np #importing numpy
import pandas as pd #importing pandas
import matplotlib.pyplot as plt #importing matplotlib for graph visualization
import networkx as nx #importing networkx for network analysis
from google.colab import files  # Colab helper module used to upload files from the local machine
uploaded=files.upload()  # runs the upload call and stores its return value in `uploaded`
Saving Keyword.csv to Keyword (7).csv
import io

# Colab stores a repeated upload under a new name on disk (the upload log
# above reads "Saving Keyword.csv to Keyword (7).csv"), so reading the fixed
# path 'Keyword.csv' can silently load an older copy. Read the bytes that
# files.upload() returned for this session instead; fall back to the path on
# disk only when nothing was uploaded.
_uploaded_csv = uploaded.get('Keyword.csv', next(iter(uploaded.values()), None))
_csv_source = io.BytesIO(_uploaded_csv) if _uploaded_csv is not None else 'Keyword.csv'

keyword_network_analysis = pd.read_csv(_csv_source)  # raw, unfiltered dataset
keyword_network_analysis.index += 1  # number the rows from 1 instead of pandas' default 0
keyword_network_analysis  # display the raw dataset; it is refined in the next segment
| Title | Keyword 1 | Keyword 2 | Keyword 3 | Keyword 4 | Keyword 5 | Keyword 6 | Keyword 7 | Keyword 8 | Keyword 9 | Keyword 10 | Keyword 11 | Keyword 12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Feb/03 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | Meta-Analyses of Financial Performance and Equ... | EQUITY | ORGANIZATIONAL sociology | PERFORMANCE | META-analysis | PSYCHOMETRICS | ORGANIZATIONAL research | FINANCIAL performance | AGENCY theory | ORGANIZATIONAL effectiveness | ORGANIZATIONAL behavior | CORPORATE governance | NaN |
| 4 | Home Country Environments, Corporate Diversifi... | DIVERSIFICATION in industry | BUSINESS planning | PERFORMANCE standards | EMPLOYEES -- Rating of | CORPORATE culture | STRATEGIC planning | ORGANIZATIONAL effectiveness | MANAGEMENT science | MANAGEMENT research | PRODUCT management | NaN | NaN |
| 5 | Safeguarding Investments in Asymmetric Interor... | INTERORGANIZATIONAL relations | INTERGROUP relations | BUSINESS communication | INVESTMENTS | SUPPLY chains | KNOWLEDGE management | INTERORGANIZATIONAL networks | CORPORATE governance | GROUP decision making | INTELLECTUAL capital | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62 | Subsidiary Staffing in Multinational Enterpris... | INTERNATIONAL business enterprises -- Management | FOREIGN subsidiaries -- Management | EMPLOYEE selection | EXECUTIVES -- Recruiting | ORGANIZATIONAL sociology | ORGANIZATIONAL behavior | AGENCY theory | RESOURCE-based theory of the firm | PERSONNEL management | EMPLOYMENT in foreign countries | SUBSIDIARY corporations -- Management | HOST countries (Business) |
| 63 | Strategic Human Resource Practices, Top Manage... | PERSONNEL management | COMPETITIVE advantage | BUSINESS networks | INDUSTRIAL management | STRATEGIC planning | SOCIAL networks | RESOURCE management | RESOURCE-based theory of the firm | HUMAN capital -- Management | INTELLECTUAL capital | DECISION making | INDUSTRIAL efficiency |
| 64 | Compensation Policy and Organizational Perform... | COMPENSATION management | ORGANIZATIONAL behavior | PERSONNEL management | HOSPITALS -- Administration | MANAGEMENT | FINANCIAL performance | WAGE payment systems | RESOURCE management | ORGANIZATIONAL effectiveness | INDUSTRIAL efficiency | FINANCIAL management | INDUSTRIAL management |
| 65 | Functional Background Identity, Diversity, and... | CROSS-functional teams | TEAMS in the workplace | GROUP identity | ORGANIZATIONAL behavior | MANAGEMENT | PERFORMANCE | PERSONNEL management | COMPETITIVE advantage | ORGANIZATIONAL effectiveness | GROUP decision making | ORGANIZATIONAL structure | ORGANIZATIONAL sociology |
| 66 | A Customer Interaction Approach to Strategy an... | SERVICE industries -- Management | CUSTOMER relations | INDUSTRIAL management | PRODUCTION management | STRATEGIC planning | CUSTOMER services | LABOR process | ORGANIZATIONAL behavior | DECISION making | CUSTOMER satisfaction | CUSTOMER orientation | MARKETING strategy |
66 rows × 13 columns
# Keep only the rows that have a first keyword. Rows whose 'Keyword 1' is
# missing (e.g. the date-header and blank rows at the top of the raw table)
# carry no keyword data to analyse.
keyword_network_analysis = keyword_network_analysis.dropna(subset=['Keyword 1'])
keyword_network_analysis  # display the refined data
| Title | Keyword 1 | Keyword 2 | Keyword 3 | Keyword 4 | Keyword 5 | Keyword 6 | Keyword 7 | Keyword 8 | Keyword 9 | Keyword 10 | Keyword 11 | Keyword 12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | Meta-Analyses of Financial Performance and Equ... | EQUITY | ORGANIZATIONAL sociology | PERFORMANCE | META-analysis | PSYCHOMETRICS | ORGANIZATIONAL research | FINANCIAL performance | AGENCY theory | ORGANIZATIONAL effectiveness | ORGANIZATIONAL behavior | CORPORATE governance | NaN |
| 4 | Home Country Environments, Corporate Diversifi... | DIVERSIFICATION in industry | BUSINESS planning | PERFORMANCE standards | EMPLOYEES -- Rating of | CORPORATE culture | STRATEGIC planning | ORGANIZATIONAL effectiveness | MANAGEMENT science | MANAGEMENT research | PRODUCT management | NaN | NaN |
| 5 | Safeguarding Investments in Asymmetric Interor... | INTERORGANIZATIONAL relations | INTERGROUP relations | BUSINESS communication | INVESTMENTS | SUPPLY chains | KNOWLEDGE management | INTERORGANIZATIONAL networks | CORPORATE governance | GROUP decision making | INTELLECTUAL capital | NaN | NaN |
| 6 | Managerialist and Human Capital Explanations f... | EXECUTIVE compensation | WAGES | HUMAN capital | LABOR economics | PERSONNEL management | MANAGEMENT science | CONTINGENCY theory (Management) | COMPENSATION management | EXECUTIVE ability (Management) | CORPORATE governance | NaN | NaN |
| 7 | Bidding Wars Over R&D-Intensive Firms: Knowled... | KNOWLEDGE management | INFORMATION resources management | MANAGEMENT information systems | BREAK-even analysis | DATA mining | MANAGEMENT science | RESEARCH & development | RESEARCH & development contracts | CORPORATE governance | DECISION making | ORGANIZATIONAL behavior | TRANSACTION costs |
| 8 | When “The Show Must Go On”: Surface Acting and... | EMOTIONS (Psychology) | INTERPERSONAL relations | STRESS (Psychology) | SOCIAL interaction | SOCIAL psychology | EMPLOYEES -- Attitudes | CUSTOMER services | CUSTOMER satisfaction | JOB stress | PEER review (Professional performance) | NaN | NaN |
| 9 | Relationships among Supervisors' and Subordina... | SUPERVISORS | JUSTICE | CONFLICT management | MEDIATION | EMPLOYEES | INDUSTRIAL relations | ORGANIZATIONAL behavior | UNITED States -- National Guard | ORGANIZATIONAL effectiveness | DECISION making | RESOURCE allocation | NaN |
| 10 | Punctuated Equilibrium and Linear Progression:... | INDUSTRIAL relations | MANAGEMENT science | DECISION theory | ORGANIZATIONAL sociology | PUNCTUATED equilibrium (Evolution) | ORGANIZATIONAL change | ORGANIZATIONAL behavior | ORGANIZATIONAL structure | BUSINESS models | ORGANIZATIONAL research | NaN | NaN |
| 14 | The Relationship between Overconfidence and th... | DECISION making | EXECUTIVES | INDUSTRIAL management | NEW products | HIGH technology industries | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 15 | Governance Through Ownership: Centuries of Pra... | CORPORATE governance | INDUSTRIAL management | STOCKHOLDERS wealth | INSTITUTIONAL investors | WAGES | NEW products | ORGANIZATIONAL structure | ORGANIZATIONAL behavior | DECENTRALIZATION in management | ORGANIZATIONAL effectiveness | NaN | NaN |
| 16 | Strategic Satisficing? A Behavioral-Agency The... | EXECUTIVES | STOCKHOLDERS wealth | STOCK repurchasing | CORPORATIONS -- Finance | INCENTIVES in industry | CORPORATE governance | STRATEGIC planning | EXECUTIVE ability (Management) | AGENCY theory | ORGANIZATIONAL behavior | ORGANIZATIONAL effectiveness | NaN |
| 17 | Exploring the Agency Consequences of Ownership... | FAMILY-owned business enterprises | DEBT | DIRECTORS of corporations | AGENCY theory | ORGANIZATIONAL behavior | ORGANIZATIONAL structure | EMPLOYEE ownership | CORPORATE governance | DECISION making | BOARDS of directors | INDUSTRIAL relations | NaN |
| 18 | Institutional Ownership Differences and Intern... | INSTITUTIONAL investors | DIVERSIFICATION in industry | BUSINESS planning | GLOBALIZATION | BOARDS of directors | INTERNATIONAL business enterprises | FOREIGN investments | PENSION trusts | HIGH technology | STRATEGIC planning | TECHNOLOGICAL innovations | INNOVATION adoption |
| 19 | Ownership Structures and R&D Investments of U.... | RESEARCH & development | INVESTMENTS | PROPERTY | INCENTIVES in industry | AGENCY theory | ORGANIZATIONAL sociology | ORGANIZATIONAL structure | STEWARDS | NaN | NaN | NaN | NaN |
| 20 | The Determinants of Executive Compensation in ... | FAMILY-owned business enterprises | CHIEF executive officers | EXECUTIVE compensation | BUSINESS enterprises | RISK | MUNICIPAL corporations | CORPORATE governance | RESEARCH & development | ORGANIZATIONAL behavior | ORGANIZATIONAL structure | NaN | NaN |
| 21 | Ownership Structure, Expropriation, and Perfor... | PROPERTY | PERFORMANCE | STOCKHOLDERS | PROFIT | MINORITY stockholders | EMINENT domain | ORGANIZATIONAL effectiveness | ORGANIZATIONAL structure | CORPORATE governance | NaN | NaN | NaN |
| 22 | CEO Stock Options: The Silent Dimension of Own... | STOCK options | STOCKS (Finance) | CHIEF executive officers | STOCK ownership | EXECUTIVE compensation | EMPLOYEE stock options | ORGANIZATIONAL structure | ORGANIZATIONAL effectiveness | DECISION making | RISK management in business | NaN | NaN |
| 26 | Assessing Creativity in Hollywood Pitch Meetin... | MANAGEMENT science | DECISION making | SCREENWRITERS | CREATIVE ability | CREATIVE ability in business | SOCIAL judgment theory (Communication) | MOTION picture authorship | SELF-perception | ORGANIZATIONAL behavior | QUALITY of products | NaN | NaN |
| 27 | Reactions to Perceived Inequity in U.S. and Du... | INTERORGANIZATIONAL relations | INDUSTRIAL organization | ORGANIZATIONAL behavior | ORGANIZATIONAL effectiveness | INTERGROUP relations | ORGANIZATIONAL structure | BUSINESS networks | SUPPLIERS | STRATEGIC alliances (Business) | NaN | NaN | NaN |
| 28 | The Impact of Community Violence and an Organi... | AGGRESSION (Psychology) | VIOLENCE | SOCIAL psychology | ORGANIZATIONAL justice | WORK environment | INDUSTRIAL relations | MANAGEMENT science | VIOLENCE in the workplace | ANGER in the workplace | EMPLOYEES -- Attitudes | PROBLEM employees | WORK attitudes |
| 29 | Explaining New CEO Origin: Firm Versus Industr... | CHIEF executive officers | PERSONNEL changes | SUCCESSION planning | EXECUTIVE succession | MANAGEMENT science | EXECUTIVES -- Recruiting | STRATEGIC planning | MANAGEMENT research | EXECUTIVE ability (Management) | JOB qualifications | ORGANIZATIONAL change | NaN |
| 30 | Do High Job Demands Increase Intrinsic Motivat... | MENTAL fatigue | JOB stress | INDUSTRIAL psychology | BURNOUT (Psychology) | SOCIAL networks | PERSONNEL management | MANAGEMENT science | MOTIVATION (Psychology) | INTRINSIC motivation | JOB qualifications | ORGANIZATIONAL behavior | ORGANIZATIONAL effectiveness |
| 31 | Organizational Hiring Patterns, Interfirm Netw... | PERSONNEL management | PERSONNEL changes | MANAGEMENT science | INTERORGANIZATIONAL relations | CONTAGION (Social psychology) | TEAMS in the workplace | EXECUTIVES -- Recruiting | EMPLOYEE recruitment | ORGANIZATIONAL sociology | BUSINESS networks | INTERORGANIZATIONAL networks | NaN |
| 32 | The Effects of Centrifugal and Centripetal For... | PRODUCT management | NEW products | PROBLEM solving | QUALITY of products | DECENTRALIZATION in management | MARKETING management | MANAGEMENT science | PRODUCT design | PRODUCT lines | PRODUCT information management | ORGANIZATIONAL behavior | NaN |
| 33 | A Social Capital Model of High-Growth Ventures | SOCIAL capital (Sociology) | INFRASTRUCTURE (Economics) | VENTURE capital | INVESTMENTS | GOING public (Securities) | COMPETITIVE advantage | ENTREPRENEURSHIP | CAPITAL market | RESOURCE management | ORGANIZATIONAL effectiveness | NaN | NaN |
| 37 | Transforming Work-Family Conflict into Commitm... | ORGANIZATIONAL behavior | MULTILEVEL marketing | ORGANIZATIONAL commitment | MARKETING management | QUALITY of work life | JOB satisfaction | AMBIVALENCE | ORGANIZATIONAL structure | ORGANIZATIONAL effectiveness | ORGANIZATIONAL sociology | NaN | NaN |
| 38 | Advocacy, Performance, and Threshold Influence... | NEW products | PERFORMANCE evaluation | COMMERCIAL products | PRODUCT management | MARKETING | DECISION making | MARKETING -- Decision making | RESEARCH & development | STRATEGIC planning | PRODUCT design | NaN | NaN |
| 39 | Managing from the Boundary: The Effective Lead... | LEADERSHIP | TEAMS in the workplace | STRATEGIC planning | SELF-management (Psychology) | MANAGEMENT -- Employee participation | CRITICAL incident technique | TASK analysis | MANAGEMENT science | EXECUTIVE ability (Management) | DECISION making | NaN | NaN |
| 40 | Team Member Functional Background and Involvem... | TEAMS in the workplace | DECISION making | CRITICAL thinking | WORKFLOW | MANAGEMENT | DECENTRALIZATION in management | MANAGEMENT science | ORGANIZATIONAL behavior | DELEGATION of authority | GROUP decision making | STRATEGIC business units | NaN |
| 41 | Happy Together? How Using Nonstandard Workers ... | LABOR supply | LABOR organizing | CONDUCT of life | ORGANIZATIONAL behavior | EMPLOYEE loyalty | ORGANIZATIONAL commitment | INDUSTRIAL relations | ORGANIZATIONAL structure | EMPLOYEES -- Attitudes | PERSONNEL management | NaN | NaN |
| 42 | Interpersonal Aggression in Work Groups: Socia... | EMPLOYEES -- Attitudes | AGGRESSION (Psychology) | TEAMS in the workplace | SOCIAL influence | INDIVIDUAL differences | INTERPERSONAL relations | SOCIAL context | ORGANIZATIONAL behavior | ORGANIZATIONAL structure | WORK environment | NaN | NaN |
| 43 | Share Price Reactions to Work-Family Initiativ... | WORK & family | PERSONNEL management | STOCKHOLDERS | WOMEN employees | STOCKS (Finance) -- Prices | MANAGEMENT science | HUMAN resource accounting | WOMEN -- Employment | ORGANIZATIONAL behavior | QUALITY of work life | NaN | NaN |
| 44 | The Role of Human Capital in Postacquisition C... | HUMAN capital | CHIEF executive officers | CAPITAL investments | LABOR economics | CONSOLIDATION & merger of corporations | EXECUTIVES -- Dismissal of | ORGANIZATIONAL effectiveness | ORGANIZATIONAL behavior | LABOR turnover | EXECUTIVE succession | NaN | NaN |
| 48 | How Much Should I Give and How Often? The Effe... | SOCIAL status | GENEROSITY | BEHAVIORAL research | LABOR productivity | SOCIAL exchange | INTERPERSONAL relations | SOCIAL factors | EMPLOYEES -- Attitudes -- Research | PERSONNEL management | ORGANIZATIONAL behavior | NaN | NaN |
| 49 | Self-Concordance at Work: Toward Understanding... | LEADERSHIP | EXECUTIVE ability (Management) | EMPLOYEE motivation | MOTIVATION (Psychology) | INDUSTRIAL psychology | MANAGEMENT science | JOB satisfaction | CHARISMATIC authority | SELF-congruence | MANAGEMENT styles | NaN | NaN |
| 50 | Cooperation, Competition, and Team Performance... | EMPLOYEE motivation | JOB performance | TEAMS in the workplace | INDUSTRIAL management | PERSONNEL management | ORGANIZATIONAL sociology | INCENTIVES in industry | INDUSTRIAL psychology | GOAL setting in personnel management | REWARD (Psychology) | NaN | NaN |
| 51 | The Impact Of Expectations On Newcomer Perform... | TEAMS in the workplace | ORGANIZATIONAL sociology | EMPLOYEE motivation | LEADERSHIP | INTERPERSONAL relations | INDUSTRIAL management | PYGMALION (Greek mythology) | GALATEA, sea nymph (Greek deity) | SOCIAL exchange | OCCUPATIONAL roles | NaN | NaN |
| 52 | THe Effects of Discontinuous Change on Latent ... | ORGANIZATIONAL change | EMPLOYEE rules | HUMAN error | RISK | INDUSTRIAL management | PERSONNEL management | ORGANIZATIONAL behavior | INDUSTRIAL psychology | ORGANIZATIONAL research | ERROR rates | NaN | NaN |
| 53 | Employee Creativity in Taiwan: An Application ... | CREATIVE ability | TAIWANESE | EMPLOYEES | PERSONNEL management | EMPLOYEE motivation | CREATIVE ability in business | INNOVATION management | CROSS-cultural differences | NaN | NaN | NaN | NaN |
| 54 | Media Legitimation Effects in the Market for I... | GOING public (Securities) | CORPORATE image | STOCKHOLDERS -- Attitudes | CAPITALISTS & financiers | MASS media | CORPORATIONS -- Investor relations | MATHEMATICAL statistics | CORPORATIONS -- Public relations | PUBLIC companies | TURNOVER (Business) | NaN | NaN |
| 55 | Giving Money to Get Money: How CEO Stock Optio... | STOCK options | GOING public (Securities) | INCENTIVES in industry | OPTIONS (Finance) | CORPORATIONS -- Valuation | CORPORATIONS -- Finance | EXECUTIVE compensation | CAPITALISTS & financiers | BUSINESS enterprises -- Valuation | DECISION making | NaN | NaN |
| 59 | A Behavioral Theory of R&D Expenditures and In... | ORGANIZATIONAL behavior | CORPORATIONS -- Finance | RESEARCH & development | INDUSTRIAL management | INNOVATIONS in business | INNOVATION management | BUSINESS planning | SHIPBUILDING industry | TECHNOLOGICAL innovations -- Economic aspects | SUCCESS in business | COMPETITIVE advantage | ORGANIZATIONAL change |
| 60 | Transformational Leadership, Conservation, and... | LEADERSHIP | ORGANIZATIONAL behavior | CREATIVE ability in business | EMPLOYEE motivation | ORGANIZATIONAL change | WORK environment -- Psychological aspects | MANAGEMENT | EXECUTIVE ability (Management) | INTRINSIC motivation | INDUSTRIAL relations | INDIVIDUAL differences | NaN |
| 61 | Informational Dissimilarity and Organizational... | ORGANIZATIONAL behavior | TEAMS in the workplace | INDUSTRIAL psychology | ORGANIZATIONAL effectiveness | ORGANIZATIONAL goals | ORGANIZATIONAL sociology | SOCIAL psychology | MANAGEMENT | ORGANIZATIONAL change | DIVISION of labor | INDUSTRIAL organization | WORK environment |
| 62 | Subsidiary Staffing in Multinational Enterpris... | INTERNATIONAL business enterprises -- Management | FOREIGN subsidiaries -- Management | EMPLOYEE selection | EXECUTIVES -- Recruiting | ORGANIZATIONAL sociology | ORGANIZATIONAL behavior | AGENCY theory | RESOURCE-based theory of the firm | PERSONNEL management | EMPLOYMENT in foreign countries | SUBSIDIARY corporations -- Management | HOST countries (Business) |
| 63 | Strategic Human Resource Practices, Top Manage... | PERSONNEL management | COMPETITIVE advantage | BUSINESS networks | INDUSTRIAL management | STRATEGIC planning | SOCIAL networks | RESOURCE management | RESOURCE-based theory of the firm | HUMAN capital -- Management | INTELLECTUAL capital | DECISION making | INDUSTRIAL efficiency |
| 64 | Compensation Policy and Organizational Perform... | COMPENSATION management | ORGANIZATIONAL behavior | PERSONNEL management | HOSPITALS -- Administration | MANAGEMENT | FINANCIAL performance | WAGE payment systems | RESOURCE management | ORGANIZATIONAL effectiveness | INDUSTRIAL efficiency | FINANCIAL management | INDUSTRIAL management |
| 65 | Functional Background Identity, Diversity, and... | CROSS-functional teams | TEAMS in the workplace | GROUP identity | ORGANIZATIONAL behavior | MANAGEMENT | PERFORMANCE | PERSONNEL management | COMPETITIVE advantage | ORGANIZATIONAL effectiveness | GROUP decision making | ORGANIZATIONAL structure | ORGANIZATIONAL sociology |
| 66 | A Customer Interaction Approach to Strategy an... | SERVICE industries -- Management | CUSTOMER relations | INDUSTRIAL management | PRODUCTION management | STRATEGIC planning | CUSTOMER services | LABOR process | ORGANIZATIONAL behavior | DECISION making | CUSTOMER satisfaction | CUSTOMER orientation | MARKETING strategy |
# Map every title to the list of its keyword cells (NaN padding included).
keyword_data = (
    keyword_network_analysis
    .set_index('Title')
    .transpose()
    .to_dict(orient='list')
)
# Keyword-only view: every column after the leading 'Title' column.
analysis = keyword_network_analysis.iloc[:, 1:]
analysis.iloc[0]  # peek at the keywords of the first remaining row
Keyword 1 EQUITY Keyword 2 ORGANIZATIONAL sociology Keyword 3 PERFORMANCE Keyword 4 META-analysis Keyword 5 PSYCHOMETRICS Keyword 6 ORGANIZATIONAL research Keyword 7 FINANCIAL performance Keyword 8 AGENCY theory Keyword 9 ORGANIZATIONAL effectiveness Keyword 10 ORGANIZATIONAL behavior Keyword 11 CORPORATE governance Keyword 12 NaN Name: 3, dtype: object
# Distinct keywords in first-seen order, scanning the keyword columns one
# after another. dict keys keep insertion order and drop repeats, which gives
# the same ordering as appending each unseen keyword to a list.
keywords_unique = list(dict.fromkeys(
    keyword
    for column_name in analysis
    for keyword in analysis[column_name].dropna()
))
from itertools import combinations

# Symmetric co-occurrence matrix: entry [i, j] counts the titles that list
# both keywords_unique[i] and keywords_unique[j]; the diagonal stays 0.
# The size comes from keywords_unique itself rather than a hard-coded 248, so
# the cell keeps working when the dataset (and its keyword count) changes.
keyword_position = {keyword: idx for idx, keyword in enumerate(keywords_unique)}
keyword_total = len(keywords_unique)
adjacency_matrix = np.zeros((keyword_total, keyword_total), dtype=int)

# Visit each title once and bump every keyword pair it contains, instead of
# rescanning all titles for every possible pair of keywords.
for title_keywords in keyword_data.values():
    # A set makes a keyword repeated within one title count once; the
    # membership test also skips the NaN padding cells.
    present = sorted({
        keyword_position[keyword]
        for keyword in title_keywords
        if keyword in keyword_position
    })
    for i, j in combinations(present, 2):
        adjacency_matrix[i, j] += 1
        adjacency_matrix[j, i] += 1
# Row and column labels of the adjacency matrix are the keywords themselves.
name_columns = keywords_unique
name_rows = keywords_unique
# Build an undirected graph whose edge attribute 'weight' is the matrix entry.
# from_numpy_array takes the same ndarray input as from_numpy_matrix, is
# available in the pinned networkx 2.4, and still exists in networkx 3.x,
# where from_numpy_matrix was removed.
weighted_network = nx.from_numpy_array(adjacency_matrix, parallel_edges=False)
# Draw the keyword network on one large canvas: nodes, then edges, then labels.
plt.figure(figsize=(40, 25))
graph_position = nx.spring_layout(weighted_network)  # node id -> plot coordinates
graph_edges = weighted_network.edges()
nx.draw_networkx_nodes(
    weighted_network,
    pos=graph_position,
    node_size=25,
    node_color='magenta',
    alpha=0.6,
)
nx.draw_networkx_edges(weighted_network, pos=graph_position, width=0.5)
# No `labels` mapping is passed, so the nodes are labelled with their ids.
nx.draw_networkx_labels(
    weighted_network,
    pos=graph_position,
    font_size=7,
    font_family='monospace',
    font_color='orange',
)
plt.show()
# Per-node degree, and the weighted degree ("strength") that sums the
# 'weight' attribute of the edges at each node.
node_degree = weighted_network.degree()
strength_compute = weighted_network.degree(weight='weight')

# Degree table: one (node id, degree) row per node.
columns = ['TotalNodes', 'Degree']
df_for_degree = pd.DataFrame(node_degree, columns=columns)
# Keyword table: row position i holds keywords_unique[i].
df_for_words = pd.DataFrame({'Keywords': keywords_unique})
# Align keywords and degrees on their row index.
df_degwords = df_for_words.merge(df_for_degree, left_index=True, right_index=True)
# Top 10 keywords by degree.
df_degwords[['Keywords', 'Degree']].sort_values(by='Degree', ascending=False).head(10)
| Keywords | Degree | |
|---|---|---|
| 23 | ORGANIZATIONAL behavior | 166 |
| 112 | ORGANIZATIONAL effectiveness | 104 |
| 16 | MANAGEMENT science | 102 |
| 20 | PERSONNEL management | 93 |
| 8 | DECISION making | 90 |
| 156 | ORGANIZATIONAL structure | 74 |
| 40 | ORGANIZATIONAL sociology | 66 |
| 88 | STRATEGIC planning | 66 |
| 47 | INDUSTRIAL management | 64 |
| 9 | CORPORATE governance | 62 |
# Strength table: one (node id, strength) row per node, joined to the
# keyword/degree table on the node id.
df_strength = pd.DataFrame(strength_compute, columns=['TotalNodes', 'Strength'])
df_res = df_degwords.merge(df_strength, how='inner', on='TotalNodes')
# Top 10 keywords by strength.
df_res[['Keywords', 'Strength']].sort_values(by='Strength', ascending=False).head(10)
| Keywords | Strength | |
|---|---|---|
| 23 | ORGANIZATIONAL behavior | 265 |
| 112 | ORGANIZATIONAL effectiveness | 144 |
| 16 | MANAGEMENT science | 136 |
| 20 | PERSONNEL management | 126 |
| 8 | DECISION making | 112 |
| 156 | ORGANIZATIONAL structure | 107 |
| 40 | ORGANIZATIONAL sociology | 96 |
| 9 | CORPORATE governance | 85 |
| 47 | INDUSTRIAL management | 84 |
| 88 | STRATEGIC planning | 80 |
# Edge list of the keyword network: one row per ordered node pair (i, j)
# joined by an edge, so every undirected edge appears in both directions.
# Rows are collected in a list and the frame is built once: DataFrame.append
# copied the whole frame on each call and no longer exists in pandas >= 2.0.
node_keywords = df_degwords.iloc[:, 0].tolist()  # keyword label per node position
node_total = weighted_network.number_of_nodes()  # replaces the hard-coded 248

edge_rows = []
for i in range(node_total):
    for j in range(node_total):
        edge_attributes = weighted_network.get_edge_data(i, j)
        if edge_attributes:
            edge_rows.append({
                'Nd1': i,
                'Nd2': j,
                'Keyword-1': node_keywords[i],
                'Keyword-2': node_keywords[j],
                'Total_Weight': edge_attributes['weight'],
            })

# Explicit columns keep the frame usable even when the graph has no edges.
# Weights stay integers here; the old append-based path upcast them to float.
feature_data = pd.DataFrame(
    edge_rows,
    columns=['Nd1', 'Nd2', 'Keyword-1', 'Keyword-2', 'Total_Weight'],
)
# Top 10 keyword pairs by co-occurrence weight.
feature_data[['Keyword-1', 'Keyword-2', 'Total_Weight']].sort_values(by='Total_Weight', ascending=False).head(10)
| Keyword-1 | Keyword-2 | Total_Weight | |
|---|---|---|---|
| 2492 | ORGANIZATIONAL effectiveness | ORGANIZATIONAL behavior | 11.0 |
| 817 | ORGANIZATIONAL behavior | ORGANIZATIONAL effectiveness | 11.0 |
| 3188 | ORGANIZATIONAL structure | ORGANIZATIONAL behavior | 9.0 |
| 848 | ORGANIZATIONAL behavior | ORGANIZATIONAL structure | 9.0 |
| 749 | ORGANIZATIONAL behavior | PERSONNEL management | 8.0 |
| 614 | PERSONNEL management | ORGANIZATIONAL behavior | 8.0 |
| 457 | MANAGEMENT science | ORGANIZATIONAL behavior | 7.0 |
| 745 | ORGANIZATIONAL behavior | MANAGEMENT science | 7.0 |
| 2554 | ORGANIZATIONAL effectiveness | ORGANIZATIONAL structure | 6.0 |
| 183 | DECISION making | ORGANIZATIONAL behavior | 6.0 |
# Pair each node's degree with its strength, then average the strength over
# all nodes that share the same degree.
feature_data_res = df_for_degree.merge(df_strength, how='inner', on='TotalNodes')
final = feature_data_res[['Degree', 'Strength']].groupby('Degree', as_index=False).mean()
final.index = final.index + 1  # number the rows from 1
final  # degree vs. average strength; this table feeds the scatter plot
| Degree | Strength | |
|---|---|---|
| 1 | 4 | 4.000000 |
| 2 | 7 | 7.000000 |
| 3 | 8 | 8.000000 |
| 4 | 9 | 9.000000 |
| 5 | 10 | 10.000000 |
| 6 | 11 | 11.000000 |
| 7 | 14 | 14.500000 |
| 8 | 15 | 16.000000 |
| 9 | 16 | 18.200000 |
| 10 | 17 | 18.153846 |
| 11 | 18 | 19.200000 |
| 12 | 19 | 20.500000 |
| 13 | 20 | 20.400000 |
| 14 | 21 | 21.666667 |
| 15 | 23 | 27.500000 |
| 16 | 24 | 27.666667 |
| 17 | 25 | 27.000000 |
| 18 | 26 | 30.250000 |
| 19 | 27 | 30.000000 |
| 20 | 28 | 30.000000 |
| 21 | 29 | 31.000000 |
| 22 | 31 | 37.000000 |
| 23 | 32 | 36.000000 |
| 24 | 35 | 44.000000 |
| 25 | 37 | 45.500000 |
| 26 | 38 | 48.000000 |
| 27 | 41 | 53.000000 |
| 28 | 42 | 47.000000 |
| 29 | 46 | 57.000000 |
| 30 | 48 | 60.000000 |
| 31 | 49 | 59.000000 |
| 32 | 55 | 78.000000 |
| 33 | 62 | 85.000000 |
| 34 | 64 | 84.000000 |
| 35 | 66 | 88.000000 |
| 36 | 74 | 107.000000 |
| 37 | 90 | 112.000000 |
| 38 | 93 | 126.000000 |
| 39 | 102 | 136.000000 |
| 40 | 104 | 144.000000 |
| 41 | 166 | 265.000000 |
# Scatter plot of average strength (y) against degree (x).
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(final['Degree'], final['Strength'])
ax.set_title('Analysis of Degree vs Average Strength')
ax.set_xlabel('Degree')
ax.set_ylabel('Average Strength')
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date!
##############################
# The 2020 file already holds the tweets from 2016 through 2020, so only the
# 2020, 2021 and 2022 files are read.
##############################
ti = [pd.read_csv(f"{year}.csv") for year in range(2020, 2023)]
frame = pd.concat(ti, axis=0, ignore_index=True)
df = frame.copy()  # work on a copy so the raw concatenated frame stays untouched

# Cast every column to str and drop all non-ASCII characters (this is what
# removes the emojis).
df = df.astype(str).apply(lambda col: col.str.encode('ascii', 'ignore').str.decode('ascii'))

# All patterns below are regular expressions. regex=True is passed explicitly
# because pandas is changing the default of Series.str.replace to regex=False;
# relying on the default would turn these calls into literal replacements.
tweets = df['tweet']
tweets = tweets.str.replace(r'@\w+', '', regex=True)  # account mentions
# One pattern covers both http:// and https:// links (the "s" is optional).
tweets = tweets.str.replace(r'\s*https?://\S+(\s+|$)', ' ', regex=True).str.strip()
tweets = tweets.str.replace(r'#\w+', '', regex=True)  # hashtags
# Drop the HTML-escaped ampersand while it is still recognisable as "&amp;".
# Removing the bare substring "amp" after punctuation stripping would also
# damage ordinary words such as "example" or "camp".
tweets = tweets.str.replace('&amp;', '', regex=False)
tweets = tweets.str.replace(r'[^\w\s]+', '', regex=True)  # remaining special characters
df['tweet'] = tweets.str.strip()  # trim leading/trailing whitespace
<ipython-input-24-9ac7525f828b>:2: FutureWarning: The default value of regex will change from True to False in a future version.
df['tweet'] = df['tweet'].str.replace('(\@\w+.*?)',"") # remove all the mentions of accounts
<ipython-input-24-9ac7525f828b>:3: FutureWarning: The default value of regex will change from True to False in a future version.
df["tweet"] = df["tweet"].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip() # remove links starting with https
<ipython-input-24-9ac7525f828b>:4: FutureWarning: The default value of regex will change from True to False in a future version.
df["tweet"] = df["tweet"].str.replace(r'\s*http?://\S+(\s+|$)', ' ').str.strip() # remove links starting with http
<ipython-input-24-9ac7525f828b>:5: FutureWarning: The default value of regex will change from True to False in a future version.
df['tweet'] = df['tweet'].str.replace('(\#\w+.*?)',"") # remove all hashtag mentions
<ipython-input-24-9ac7525f828b>:6: FutureWarning: The default value of regex will change from True to False in a future version.
df['tweet'] = df['tweet'].str.replace(r'[^\w\s]+', '') # remove all the remaining special characters
# The 2020 file also contains older tweets; keep only rows whose year is
# greater than 2016.
df['year'] = pd.DatetimeIndex(df['date']).year
# Explicit copy so the column assignments below operate on an independent
# frame rather than on a slice of the unfiltered one.
df = df.loc[df['year'] > 2016].copy()

# Lower-case every word and drop English stop words.
stop = stopwords.words('english')
stop_lookup = set(stop)  # constant-time membership tests; `stop` stays a list
df['tweet'] = df['tweet'].apply(
    lambda text: ' '.join(
        word for word in map(str.lower, text.split()) if word not in stop_lookup
    )
)

# Tweets that are empty (or whitespace only) after cleaning become NaN and are
# dropped. The pattern needs "\s" (whitespace); a bare "s" would instead match
# tweets consisting only of the letter "s".
df['tweet'] = df['tweet'].replace(r'^\s*$', float('NaN'), regex=True)
df.dropna(subset=['tweet'], inplace=True)

# Copy the two needed columns so the tokenised column is written to its own
# frame (avoids pandas' SettingWithCopyWarning).
df1 = df[['year', 'tweet']].copy()
df1['tweet'] = df1['tweet'].map(nltk.word_tokenize)  # each tweet becomes a list of tokens
df1
/usr/local/lib/python3.8/dist-packages/pandas/core/generic.py:5516: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self[name] = value
| year | tweet | |
|---|---|---|
| 0 | 2020 | [entertainment, critical, cars, drive] |
| 1 | 2020 | [meeting, larry, ellison, seek, advice, back, ... |
| 2 | 2020 | [absolutely] |
| 3 | 2020 | [tesla, incar, gaming, autonomous, world] |
| 4 | 2020 | [absolutely] |
| ... | ... | ... |
| 15852 | 2022 | [reminds, hex, edited, ultima, v, get, final, ... |
| 15853 | 2022 | [yay, switzerland] |
| 15854 | 2022 | [way, touch, voters, youre, three, generations... |
| 15856 | 2022 | [lets, make, roaring, 20s, happen] |
| 15857 | 2022 | [great, work, tesla, team, worldwide] |
12780 rows × 2 columns
df2 = df1.explode('tweet') # one row per token; rows from the same tweet keep that tweet's index label
df2
| year | tweet | |
|---|---|---|
| 0 | 2020 | entertainment |
| 0 | 2020 | critical |
| 0 | 2020 | cars |
| 0 | 2020 | drive |
| 1 | 2020 | meeting |
| ... | ... | ... |
| 15857 | 2022 | great |
| 15857 | 2022 | work |
| 15857 | 2022 | tesla |
| 15857 | 2022 | team |
| 15857 | 2022 | worldwide |
94685 rows × 2 columns
# Count how often each word occurs within each year.
df2['num'] = 1
df3 = df2.groupby(['year', 'tweet'], as_index=False)['num'].sum()

# For every year from 2017 through 2022 keep the ten words with the highest count.
fi = [
    df3.loc[df3['year'] == yr].sort_values(by=['num'], ascending=False).head(10)
    for yr in range(2017, 2023)
]
top10 = pd.concat(fi, axis=0, ignore_index=True)
top10.sort_values(by=['year', 'num'], ascending=False)  # show the top 10 words, latest year first
| year | tweet | num | |
|---|---|---|---|
| 50 | 2022 | tesla | 62 |
| 51 | 2022 | people | 34 |
| 52 | 2022 | yes | 32 |
| 53 | 2022 | would | 30 |
| 54 | 2022 | one | 30 |
| 55 | 2022 | starlink | 28 |
| 56 | 2022 | good | 28 |
| 57 | 2022 | car | 26 |
| 58 | 2022 | high | 22 |
| 59 | 2022 | great | 22 |
| 40 | 2021 | tesla | 206 |
| 41 | 2021 | great | 102 |
| 42 | 2021 | good | 98 |
| 43 | 2021 | much | 93 |
| 44 | 2021 | like | 89 |
| 45 | 2021 | haha | 89 |
| 46 | 2021 | would | 88 |
| 47 | 2021 | time | 80 |
| 48 | 2021 | beta | 77 |
| 49 | 2021 | true | 73 |
| 30 | 2020 | tesla | 188 |
| 31 | 2020 | yes | 160 |
| 32 | 2020 | great | 128 |
| 33 | 2020 | much | 120 |
| 34 | 2020 | good | 112 |
| 35 | 2020 | would | 92 |
| 36 | 2020 | sure | 88 |
| 37 | 2020 | haha | 83 |
| 38 | 2020 | yeah | 83 |
| 39 | 2020 | high | 80 |
| 20 | 2019 | tesla | 251 |
| 21 | 2019 | yes | 154 |
| 22 | 2019 | would | 98 |
| 23 | 2019 | great | 97 |
| 24 | 2019 | starship | 90 |
| 25 | 2019 | like | 87 |
| 26 | 2019 | good | 85 |
| 27 | 2019 | much | 75 |
| 28 | 2019 | high | 74 |
| 29 | 2019 | probably | 70 |
| 10 | 2018 | tesla | 286 |
| 11 | 2018 | car | 119 |
| 12 | 2018 | yes | 107 |
| 13 | 2018 | good | 103 |
| 14 | 2018 | 3 | 103 |
| 15 | 2018 | model | 98 |
| 16 | 2018 | like | 94 |
| 17 | 2018 | would | 76 |
| 18 | 2018 | dont | 74 |
| 19 | 2018 | people | 73 |
| 0 | 2017 | tesla | 71 |
| 1 | 2017 | model | 59 |
| 2 | 2017 | yes | 54 |
| 3 | 2017 | good | 53 |
| 4 | 2017 | like | 49 |
| 5 | 2017 | one | 46 |
| 6 | 2017 | first | 43 |
| 7 | 2017 | next | 43 |
| 8 | 2017 | 3 | 40 |
| 9 | 2017 | yeah | 39 |
# Draw one histogram of the per-word counts for every year 2017-2022.
for yr in range(2017, 2023):
    yearly_counts = df3[df3['year'] == yr]
    count_bins = range(0, 100, 10)  # bin edges 0, 10, ..., 90
    sns.histplot(data=yearly_counts, x="num", bins=count_bins)
    plt.title(
        "Histogram for frequency of words for the year - " + str(yr),
        fontweight="bold",
        fontsize=15,
    )
    plt.show()
# Rank the words inside each year by descending count; method='first' breaks
# ties by order of appearance so every word gets a distinct rank.
yearly_rank = df3.groupby(['year'])['num'].rank(ascending=False, method='first')
df5 = df3.assign(rank=yearly_rank)
df5.sort_values(by=['year', 'num'], ascending=False)
| year | tweet | num | rank | |
|---|---|---|---|---|
| 26628 | 2022 | tesla | 62 | 1.0 |
| 26273 | 2022 | people | 34 | 2.0 |
| 26821 | 2022 | yes | 32 | 3.0 |
| 26218 | 2022 | one | 30 | 4.0 |
| 26809 | 2022 | would | 30 | 5.0 |
| ... | ... | ... | ... | ... |
| 3251 | 2017 | youth | 1 | 3254.0 |
| 3252 | 2017 | yrs | 1 | 3255.0 |
| 3254 | 2017 | zedd | 1 | 3256.0 |
| 3256 | 2017 | zeroth | 1 | 3257.0 |
| 3257 | 2017 | zootopia | 1 | 3258.0 |
26830 rows × 4 columns
# Total number of word occurrences per year (sum of all per-word counts).
df6 = (
    df3.groupby(['year'], as_index=False)['num']
    .sum()
    .rename(columns={'num': 'total'})
)
df6
| year | total | |
|---|---|---|
| 0 | 2017 | 8657 |
| 1 | 2018 | 19174 |
| 2 | 2019 | 19530 |
| 3 | 2020 | 22123 |
| 4 | 2021 | 19875 |
| 5 | 2022 | 5326 |
# Attach each year's total word count to every word row of that year.
df7 = pd.merge(df5, df6, how='left', on=['year'])
# Term frequency = occurrences of the word in a year / all word occurrences in that year.
df7['term frequency'] = df7['num'].div(df7['total'])
df7
| year | tweet | num | rank | total | term frequency | |
|---|---|---|---|---|---|---|
| 0 | 2017 | 0 | 3 | 534.0 | 8657 | 0.000347 |
| 1 | 2017 | 008 | 1 | 1343.0 | 8657 | 0.000116 |
| 2 | 2017 | 015 | 1 | 1344.0 | 8657 | 0.000116 |
| 3 | 2017 | 040 | 1 | 1345.0 | 8657 | 0.000116 |
| 4 | 2017 | 07m | 1 | 1346.0 | 8657 | 0.000116 |
| ... | ... | ... | ... | ... | ... | ... |
| 26825 | 2022 | youre | 4 | 477.0 | 5326 | 0.000751 |
| 26826 | 2022 | youtube | 2 | 1526.0 | 5326 | 0.000376 |
| 26827 | 2022 | yup | 10 | 83.0 | 5326 | 0.001878 |
| 26828 | 2022 | zealand | 2 | 1527.0 | 5326 | 0.000376 |
| 26829 | 2022 | zukunft | 2 | 1528.0 | 5326 | 0.000376 |
26830 rows × 6 columns
# Put rank and term frequency on a natural-log scale.
df8 = df7.assign(
    log_tf=np.log(df7['term frequency']),
    log_rank=np.log(df7['rank']),
)

# Log-log plot of term frequency against rank, one line per year.
sns.set(rc={'figure.figsize': (11, 11)})
sns.lineplot(data=df8, x='log_rank', y="log_tf", hue='year', palette='hls')
plt.title(' log-log plots of word frequencies and rank for each year', fontweight="bold")
plt.show()
# Copy the selected columns so the new 'bigrams' column is added to an
# independent frame (avoids pandas' SettingWithCopyWarning).
gram = df[['year', 'tweet']].copy()
# Bigrams are built from the tokenised tweets in df1 and aligned to gram by
# index label: each token list becomes the list of its consecutive word pairs.
gram['bigrams'] = df1['tweet'].apply(lambda tokens: list(nltk.ngrams(tokens, 2)))
<ipython-input-39-e6b0bbcee79a>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy gram['bigrams'] = df1['tweet'].apply(lambda x: list(nltk.ngrams(x,2))) # calculating bigrams
# One row per bigram; tweets that produced no bigram (NaN after explode) are dropped.
bic = (
    gram[['year', 'bigrams']]
    .explode('bigrams')
    .dropna(subset=['bigrams'])
)
# plotting bigram network graphs for each year
def networkgraph(thresh, year):
    """Draw the bigram network of one year.

    Reads the module-level ``bic`` frame (columns ``year`` and ``bigrams``),
    counts how often each (word1, word2) pair occurs in the given year and
    draws an undirected graph with one edge per pair that is frequent enough.

    Args:
        thresh: Minimum number of occurrences (inclusive) a pair needs in
            order to appear in the graph.
        year: Value matched against ``bic['year']``.
    """
    # Bigrams of the requested year, split into one column per word.
    year_pairs = bic.loc[bic['year'] == year, 'bigrams'].to_list()
    edges = pd.DataFrame(year_pairs, columns=['word1', 'word2'])

    # Count identical pairs.
    edges['num'] = 1
    edges = edges.groupby(['word1', 'word2'], as_index=False)['num'].sum()

    # Keep pairs that occur at least `thresh` times.
    edges = edges.loc[edges['num'] >= thresh]

    G = nx.from_pandas_edgelist(edges, source='word1', target='word2')  # undirected graph of word pairs
    plt.figure(figsize=(40, 30))  # large canvas so the node labels stay readable
    nx.draw(G, with_labels=True)
    # Label the figure so the graphs of different years can be told apart.
    plt.title("Bigram network for the year - " + str(year), fontweight="bold", fontsize=30)
    plt.show()
# Draw the network graph for each year 2017-2022, keeping pairs seen at least twice.
for graph_year in range(2017, 2023):
    networkgraph(2, graph_year)